In [1]:
%pylab
%matplotlib inline
In [2]:
cd ..
In [3]:
import sys
import numpy as np
import skimage
import cv2
import sklearn
import imp
In [4]:
from holoviews import *
In [5]:
import neukrill_net.utils
import neukrill_net.highlevelfeatures
In [6]:
import time
In [7]:
settings = neukrill_net.utils.Settings('settings.json')
In [8]:
X,y = settings.flattened_train_paths(settings.classes)
In [9]:
pkl_names = ['pftas.pkl','contourhistogram.pkl','contourmoments.pkl','haralick.pkl']
In [11]:
import sklearn.externals.joblib

t0 = time.time()
hlf = []
XF_list = []
for pkl_name in pkl_names:
    # each cache holds a (feature extractor, feature array) pair dumped with joblib
    tmp = sklearn.externals.joblib.load('cache/'+pkl_name)
    hlf += [tmp[0]]
    XF_list += [tmp[1]]
print("Loading features took {}".format(time.time()-t0))
In [12]:
# stack the feature blocks along the last (feature) axis
XF = np.concatenate(XF_list, 2)
In [13]:
XF.shape
Out[13]:
In [14]:
XF[0,0,:]
Out[14]:
In [15]:
import sklearn.naive_bayes
In [16]:
clf = sklearn.naive_bayes.GaussianNB()
In [17]:
import sklearn.cross_validation
import sklearn.preprocessing
import sklearn.metrics

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [19]:
import sklearn.feature_selection

X_new = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=45).fit_transform(XF.squeeze(0), y)
In [20]:
my_X = X_new
clf = sklearn.naive_bayes.GaussianNB()
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
On the original (full) feature set:
In [21]:
import sklearn.ensemble
In [23]:
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
This is similar to what we got with just the Contour Moments and Haralick features.
On the reduced feature set:
In [24]:
my_X = X_new
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
It does slightly worse with fewer features. Maybe k=45 was too few? A quick sweep over k is sketched below.
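A minimal sketch of such a sweep (assuming XF and y from the cells above; the k values are arbitrary, and feature selection is fit on the full set, as in the cells above):

import numpy as np
import sklearn.naive_bayes
import sklearn.feature_selection
import sklearn.preprocessing
import sklearn.cross_validation
import sklearn.metrics

# sweep the number of selected features and watch the Naive Bayes log loss
for k in (25, 45, 100, XF.shape[2]):
    X_k = sklearn.feature_selection.SelectKBest(
        sklearn.feature_selection.f_classif, k=k).fit_transform(XF.squeeze(0), y)
    X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
        sklearn.preprocessing.StandardScaler().fit_transform(X_k), y,
        test_size=0.5, random_state=42)
    clf = sklearn.naive_bayes.GaussianNB()
    clf.fit(X_train, y_train)
    print("k={}: logloss={}".format(
        k, sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))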
In [25]:
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [27]:
# Extra trees
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
clf = sklearn.ensemble.ExtraTreesClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [28]:
# Adaboost trees
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
clf = sklearn.ensemble.AdaBoostClassifier(n_estimators=1000, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
Try DBSCAN
In [29]:
import sklearn.cluster

clusterer = sklearn.cluster.DBSCAN()
In [31]:
t0 = time.time()
cluster_pred = clusterer.fit_predict(XF.squeeze(0))
print("Time={}".format(time.time()-t0))
In [34]:
cluster_pred
Out[34]:
It's no good (a quick check of the label counts is sketched below).
Try KMeans next.
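Before moving on, a minimal check of the DBSCAN labelling (assuming cluster_pred from the DBSCAN cell above; -1 is DBSCAN's noise label):

import numpy as np

# count how many samples landed in each DBSCAN cluster (-1 = noise)
for label in np.unique(cluster_pred):
    print("cluster {}: {} samples".format(label, np.sum(cluster_pred == label)))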
In [35]:
clusterer = sklearn.cluster.MiniBatchKMeans(n_clusters=11, max_iter=100, batch_size=100,
                                            compute_labels=True, random_state=42)
In [36]:
t0 = time.time()
cluster_pred = clusterer.fit_predict(XF.squeeze(0))
print("Time={}".format(time.time()-t0))
In [38]:
cluster_pred
Out[38]:
In [39]:
import scipy.stats
In [43]:
# assign each class to the cluster that most of its samples fall into
n_classes = len(np.unique(y))
y_ = np.array(y)
class_clusters = np.ones((n_classes)) * -1
for class_index in range(n_classes):
    li = (y_ == class_index)
    class_clusters[class_index] = scipy.stats.mode(cluster_pred[li])[0]
In [44]:
class_clusters
Out[44]:
In [59]:
num_samples_per_class = [sum(y_ == class_index) for class_index in range(n_classes)]
num_samples_per_class = np.array(num_samples_per_class)
In [61]:
num_samples_per_cluster = np.zeros(11)
for cluster_index in range(11):
    li = (class_clusters == cluster_index)
    num_samples_per_cluster[cluster_index] = sum(num_samples_per_class[li])
In [62]:
num_samples_per_cluster
Out[62]:
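One hedged way to quantify how well the clusters line up with the true classes is the adjusted Rand index (a minimal sketch, assuming y_ and cluster_pred from the cells above):

import sklearn.metrics

# adjusted Rand index: 1.0 = perfect agreement with the class labels,
# values near 0 = roughly chance-level agreement
print("ARI={}".format(sklearn.metrics.adjusted_rand_score(y_, cluster_pred)))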
Try playing around with the clustering parameters (a sweep over the number of clusters is sketched after the next run).
In [144]:
clusterer = sklearn.cluster.MiniBatchKMeans(n_clusters=11, max_iter=5000, batch_size=1500,
                                            compute_labels=True, random_state=42)
In [145]:
t0 = time.time()
cluster_pred = clusterer.fit_predict(XF.squeeze(0))
print("Time={}".format(time.time()-t0))
In [146]:
n_classes = len(np.unique(y))
y_ = np.array(y)
class_clusters = np.ones((n_classes)) * -1
for class_index in range(n_classes):
    li = (y_ == class_index)
    class_clusters[class_index] = scipy.stats.mode(cluster_pred[li])[0]
In [147]:
class_clusters
Out[147]:
In [148]:
n_clusters = len(np.unique(cluster_pred))
num_samples_per_cluster = np.zeros(n_clusters)
for cluster_index in range(n_clusters):
    li = (class_clusters == cluster_index)
    num_samples_per_cluster[cluster_index] = sum(num_samples_per_class[li])
In [149]:
num_samples_per_cluster
Out[149]:
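The run above still uses 11 clusters; a sketch of actually sweeping the number of clusters, scoring each run with the adjusted Rand index (assuming XF, y_ and n_classes from the cells above; the cluster counts tried are arbitrary):

import sklearn.cluster
import sklearn.metrics

# try a few cluster counts and score each against the true class labels
for n_clusters in (8, 11, 20, n_classes):
    clusterer = sklearn.cluster.MiniBatchKMeans(n_clusters=n_clusters, max_iter=1000,
                                                batch_size=1000, random_state=42)
    pred = clusterer.fit_predict(XF.squeeze(0))
    print("n_clusters={}: ARI={}".format(
        n_clusters, sklearn.metrics.adjusted_rand_score(y_, pred)))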
Try Spectral clustering
In [151]:
clusterer = sklearn.cluster.SpectralClustering(n_clusters=8, random_state=42, n_init=10, n_neighbors=10)
In [152]:
t0 = time.time()
cluster_pred = clusterer.fit_predict(XF.squeeze(0))
print("Time={}".format(time.time()-t0))
In [153]:
clusterer = sklearn.cluster.AgglomerativeClustering(n_clusters=8)
In [ ]:
t0 = time.time()
cluster_pred = clusterer.fit_predict(XF.squeeze(0))
print("Time={}".format(time.time()-t0))
In [ ]:
cluster_pred
In [ ]:
import sklearn.linear_model

clf = sklearn.linear_model.LogisticRegression(random_state=42)
In [ ]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [ ]:
XF.squeeze(0)[:,0:1].shape
In [ ]:
len(y)
In [ ]:
# Try SVC on a single feature element from the vector
import sklearn.svm

clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)[:,0:1]), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [ ]:
# Naive Bayes on a single feature element
clf = sklearn.naive_bayes.GaussianNB()
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)[:,0:1]), y, test_size=0.5, random_state=42)
print("Time={}".format(time.time()-t0))
t0 = time.time()
clf.fit(X_train, y_train)
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [ ]:
clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
One-vs-one SVC with the default RBF kernel:
In [ ]:
clf = sklearn.svm.SVC(probability=True, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))